/*          
    Purpose: Clean demographic variables, find actual 
    		 income (1-year) and binned family income, 
    		 and find coarsened modal retrospective father 
    		 occupation for linked adult children in the 
    		 PSID who are heads of their household.

    Creates: PSID_sons_retrospective.dta
    		 PSID_fathers_retrospective.dta
*/
* Start from an empty session, turn off output paging, and move to the PSID folder
clear
set more off
cd "$Mydirectory1/1_DataSources/PSID"

**----------------------------------------------------**
**----------------------------------------------------**

********************************
** BRING IN DATA AND RESHAPE
********************************

* Load the raw individual/family extract and drop the variable groups not used below
use ./RawData/PSID_raw_indfam.dta, clear
	drop HHlabor* h_w_inc* maritalstatus* selfemployed* union* h_w_acc* grade* agehead* mainocc* intnumber* HHunioncontract* ER* V* laborinc*

* Unique identifier (wide)
	/* Built as famid*1000 + personnumber. The storage type is given
	   explicitly: without one, -gen- creates a float, which holds integers
	   exactly only up to 2^24, so an ID used as a merge key should not
	   depend on that limit. */
	gen long son_id = famid*1000 + personnumber
	order son_id, before(famid)
	label var son_id "Child 1968 ID"
	
/*  Following the Mazumder paper, only certain parts of the PSID are kept: 
    the main nationally representative sample (SRC), plus the SEO sample
    (5000<famid<7000), which is used in robustness checks. 
	The immigrant samples have values between 3000 and 5000, 
	and Latino families added later have values >7000; both are dropped. 
	
	Source: https://psidonline.isr.umich.edu/guide/faq.aspx   */
	gen src_sample = famid<3000
	gen seo_sample = famid>5000 & famid<7000
	keep if src_sample==1 | seo_sample==1 


*------------------------------------*
* MERGE IN OTHER RELEVANT VARIABLES
*------------------------------------*
* Geography
	/* keep(master match) discards records that exist only in the using file;
	   nogenerate suppresses the _merge indicator. */
	merge 1:1 son_id using ./RawData/geopsid.dta, keep(master match) nogenerate
	drop head* foreign* moved* state_childhood*

* Father occupation as recalled by household heads
	merge 1:1 son_id using ./RawData/retrospective_info.dta, keep(master match) nogenerate

* Cross-sectional weights
	merge 1:1 son_id using ./RawData/xsection_weights_info.dta, keep(master match) nogenerate

* Family composition changes
	merge 1:1 son_id using ./RawData/fam_change_info.dta, keep(master match) nogenerate

/* PSID FIMS father-child links: unlike the merges above, only matched
   records survive, so respondents without a father link are dropped here. */
	merge 1:1 son_id using ./FIMS/FIMSFathersKids_SRC_SEO.dta, keep(match) nogenerate
	order father_id, after(son_id)
	
* Go from one row per respondent to one row per respondent-year
	/* Each stub below is a year-suffixed variable family in the wide file;
	   the year suffix becomes the new variable -year-. */
	reshape long age relate state race famweight indweight famweight_coreimm ///
		father_occ_ totfaminc fatherforeign region4_childhood ///
		indweight_coreimm num_ sequencenumber new_head_ family_comp_ ///
		xsection_weight_, i(son_id father_id sex) j(year)

	* Later -by son_id:- statements rely on this ordering
	sort son_id year
	
**----------------------------------------------------**
**----------------------------------------------------**

***********************
* CLEAN DEMOGRAPHIC VARIABLES
***********************

* Flag household heads
	/* The code identifying the head in -relate- is 1 in waves before 1983
	   and 10 from 1983 onward. */
	gen hh_head = (year<1983 & relate==1) | (year>=1983 & relate==10)
	drop relate*

*----------------------------------------*	
*----------------------------------------*
	keep if hh_head==1 //heads only
	/*Note: every downloaded demographic and 
	        occupation variable is asked of 
	        the head. */
*----------------------------------------*
*----------------------------------------*

* Weights: use the core/immigrant individual weight for 1997-2015
	replace indweight = indweight_coreimm if inrange(year, 1997, 2015)
	label var indweight "PSID longitudinal weight, son"
	
	rename xsection_weight_ xsection_weight
	label var xsection_weight "PSID x section weight, starting in 97"
	
* Age

	//Codes 0 and 999 are treated as not reported
	replace age=. if inlist(age, 0, 999)

	//One birth year per respondent: the smallest (year - age) across waves
	by son_id: egen byr_s2 = min(year - age)
	label var byr_s2 "Birth year (using minimum)"

	//Age implied by that birth year, minus one; values outside 1-100 are blanked
	gen age_s = year - byr_s2 - 1 
	replace age_s=. if age_s>100 | age_s<=0
	label var age_s "Age of child"
	
* Race: blank codes 0, 3-7 and 9
	/* NOTE(review): code 8 is not touched by this rule -- confirm that
	   keeping it is intended. */
	replace race=. if inlist(race, 0, 9) | (race>2 & race<8)

* Most frequently reported race per respondent (ties go to the smallest code)
	bysort son_id: egen race_son=mode(race), minmode
	label var race_son "Modal race of son"
	drop race
		
* Grew up in the South
	//1 when the childhood region code is 3, 0 for any other non-missing code
	gen south_temp = (region4_childhood==3) if !missing(region4_childhood)
	
	//Respondent-level flag: 1 if any wave reports a Southern childhood
	bysort son_id: egen south_retrospective = max(south_temp)
	label var south_retrospective "Respondent ever said grew up in South"
	
	drop south_temp

**----------------------------------------------------**
**----------------------------------------------------**

*************************************
* CLEAN/CONSTRUCT INCOME VARIABLES
*************************************
	
	/* Keep a floored copy of reported family income: zero and negative
	   values become 0, missing values stay missing. */
	clonevar totfaminc_og = totfaminc //preserve original version
	replace totfaminc_og=0 if totfaminc_og<=0

	//Fix income variable: non-positive income is unusable for the age-40 search
	replace totfaminc=. if totfaminc<=0 
	/* NOTE(review): only a weight of exactly 0 is excluded here; rows with
	   a missing weight are left in. */
	replace totfaminc=. if indweight==0 //exclude observations w/o weight
	label var totfaminc "Total family income"
	
    *---------------------------*
    /* MAZUMDER APPROACH TO 
       FINDING ACTUAL INCOME */
    *---------------------------*
	sort son_id year age_s
 
    * Center around age 40
	local son_center=40
	local center `son_center'

    * Look between 30 and 50
	local son_band=10
	local band `son_band'

    * # of observations that'll be tagged (1)
	local son_count=1 
	local obs_ct `son_count'
	
	foreach var of varlist totfaminc {

        * Binary: data is okay to use (i.e., there's non-missing income at age 40)
		gen indson_`var' = 0
		replace indson_`var' = 1 if `var'!=. & age_s==`center'
		
        * Find total of this binary for each person
		bysort son_id: egen total_indson_`var' = total(indson_`var')

        /* Now iteratively search the bands starting at the center. 
           Tag the observation in the sample if it's non-missing and 
           the respondent has not reached the observation count (1,5, or 10). 
           The total number of observations used is then recalculated. 
           Note that the upper band is arbitrarily privileged: at each
           distance i, age center+i is checked before age center-i. 
        */
		forval i = 1/`band' {
         * Upper band (41-50)
			replace indson_`var' = 1 if `var'!=. & age_s==(`center'+`i') & (total_indson_`var'<`obs_ct')
			
         * Re-calculate the total observations used
			drop total_indson_`var'
			by son_id: egen total_indson_`var' = total(indson_`var') 

         * Lower band (30-39)
			replace indson_`var' = 1 if `var'!=. & age_s==(`center'-`i') & total_indson_`var'<`obs_ct'
			
         * Re-calculate the total observations used
			drop total_indson_`var'
			by son_id: egen total_indson_`var' = total(indson_`var') 
	}

	* Drop observations without at least one year of viable income 
		drop if total_indson_`var'<`obs_ct'

	/* The four steps below copy values from the tagged row to every row of
	   the respondent using min(). With obs_ct = 1, and assuming one row per
	   respondent per age, a single row is tagged, so min() just spreads it. */

	* Grab the tagged year of income and give it to the respondent in every year 
		gen chosen_`var' = `var' if indson_`var'==1
		by son_id: egen child_`var' = min(chosen_`var')
		label var child_`var' "`var' chosen around age 40 for child"

	* Grab the age of the tagged observation and give it to the respondent in every year
		gen chosen_age_`var' = age_s if indson_`var'==1
		by son_id: egen age_child_`var' = min( chosen_age_`var' )
		label var age_child_`var' "Age at which `var' chosen around age 40 for child"
		
	* Grab the survey year of the tagged observation and give it to the respondent in every year
		gen chosen_year_`var' = year if indson_`var'==1
		by son_id: egen year_child_`var' = min( chosen_year_`var' )
		label var year_child_`var' "Year at which `var' chosen around age 40 for child"
		
	* Grab the weight of the tagged observation and give it to the respondent in every year
		gen chosen_weight_`var' = indweight if indson_`var'==1
		by son_id: egen weight_`var' = min(chosen_weight_`var')
		label var weight_`var' "Weight at which `var' chosen around age 40 for child" 
	}

	/* Remove the working variables. Explicit wildcards are used so this
	   does not rely on variable-name abbreviation, which fails when
	   -set varabbrev off- is in effect. */
	drop total_indson_* indson_* chosen_* 

* Rename a couple variables before binning
	/* The binning loop looks up the survey year of each income measure in a
	   variable named year_<income variable>; create and rename to match. */
	clonevar year_totfaminc_og = year
	rename year_child_totfaminc year_totfaminc_age40 
	rename child_totfaminc totfaminc_age40

*---------------------------------------------*
* PUT INCOMES IN BINS (FOLLOWING GSS BINS)
*---------------------------------------------*

/* For each income measure, build one binned variable per GSS bin scheme
   (1977, 1982, 1986, 1991, 1998, 2006, 2016). Each bin is coded at its
   midpoint; the bottom bin at 0.75 x its upper bound and the open top bin
   at 1.25 x its lower bound. Each scheme is blanked outside its own
   survey-year window, read from year_<income variable>. */
foreach var of varlist totfaminc_age40 totfaminc_og {

	gen income77_bins_`var'=.
	replace income77_bins_`var'=750 if `var'>=0 & `var'<1000 //<1000
	replace income77_bins_`var'=2000 if `var'>=1000 & `var'<3000 //1-3k
	replace income77_bins_`var'=4000 if `var'>=3000 & `var'<5000 //3-5k
	replace income77_bins_`var'=6000 if `var'>=5000 & `var'<7000 //5-7
	replace income77_bins_`var'=8500 if `var'>=7000 & `var'<10000 //7-10
	replace income77_bins_`var'=12500 if `var'>=10000 & `var'<15000 //10-15
	replace income77_bins_`var'=17500 if `var'>=15000 & `var'<20000 //15-20
	replace income77_bins_`var'=22500 if `var'>=20000 & `var'<25000 //20-25
	replace income77_bins_`var'=37500 if `var'>=25000 & `var'<50000 //25-50 
	replace income77_bins_`var'=1.25*50000 if `var'>=50000 & `var'<. //50+ 
	replace income77_bins_`var'=. if year_`var'>=1982

	gen income82_bins_`var'=.
	replace income82_bins_`var'=750 if `var'>=0 & `var'<1000 //<1000
	replace income82_bins_`var'=2000 if `var'>=1000 & `var'<3000 //1-3k
	replace income82_bins_`var'=4000 if `var'>=3000 & `var'<5000 //3-5k
	replace income82_bins_`var'=6000 if `var'>=5000 & `var'<7000 //5-7
	replace income82_bins_`var'=8500 if `var'>=7000 & `var'<10000 //7-10
	replace income82_bins_`var'=12500 if `var'>=10000 & `var'<15000 //10-15
	replace income82_bins_`var'=17500 if `var'>=15000 & `var'<20000 //15-20
	replace income82_bins_`var'=22500 if `var'>=20000 & `var'<25000 //20-25
	replace income82_bins_`var'=30000 if `var'>=25000 & `var'<35000 //25-35 
	replace income82_bins_`var'=42500 if `var'>=35000 & `var'<50000 //35-50 
	replace income82_bins_`var'=1.25*50000 if `var'>=50000 & `var'<. //50+ 
	replace income82_bins_`var'=. if year_`var'<1982 | year_`var'>=1986

	gen income86_bins_`var'=.
	replace income86_bins_`var'=0.75*4000 if `var'>=0 & `var'<4000 //<4k
	replace income86_bins_`var'=5000 if `var'>=4000 & `var'<6000 //4-6k
	replace income86_bins_`var'=7000 if `var'>=6000 & `var'<8000 //6-8k
	replace income86_bins_`var'=10250 if `var'>=8000 & `var'<12500 //8-12.5
	replace income86_bins_`var'=15000 if `var'>=12500 & `var'<17500 //12.5-17.5
	replace income86_bins_`var'=20000 if `var'>=17500 & `var'<22500 //17.5-22.5
	replace income86_bins_`var'=26250 if `var'>=22500 & `var'<30000 //22.5-30
	replace income86_bins_`var'=35000 if `var'>=30000 & `var'<40000 //30-40
	replace income86_bins_`var'=50000 if `var'>=40000 & `var'<60000 //40-60 
	replace income86_bins_`var'=1.25*60000 if `var'>=60000 & `var'<. //60+ 
	replace income86_bins_`var'=. if year_`var'<1986 | year_`var'>=1991
	
	gen income91_bins_`var'=.
	replace income91_bins_`var'=0.75*5000 if `var'>=0 & `var'<5000 //<5k
	replace income91_bins_`var'=7500 if `var'>=5000 & `var'<10000 //5-10k
	replace income91_bins_`var'=12500 if `var'>=10000 & `var'<15000 //10-15k
	replace income91_bins_`var'=17500 if `var'>=15000 & `var'<20000 //15-20
	replace income91_bins_`var'=22500 if `var'>=20000 & `var'<25000 //20-25
	replace income91_bins_`var'=30000 if `var'>=25000 & `var'<35000 //25-35
	replace income91_bins_`var'=40000 if `var'>=35000 & `var'<45000 //35-45
	replace income91_bins_`var'=50000 if `var'>=45000 & `var'<55000 //45-55
	replace income91_bins_`var'=60000 if `var'>=55000 & `var'<65000 //55-65
	replace income91_bins_`var'=70000 if `var'>=65000 & `var'<75000 //65-75
	replace income91_bins_`var'=80000 if `var'>=75000 & `var'<85000 //75-85
	replace income91_bins_`var'=1.25*85000 if `var'>=85000 & `var'<. //85+ 
	replace income91_bins_`var'=. if year_`var'<1991 | year_`var'>=1998 

	gen income98_bins_`var'=.
	replace income98_bins_`var'=0.75*6000 if `var'>=0 & `var'<6000 //<6k
	replace income98_bins_`var'=8000 if `var'>=6000 & `var'<10000 //6-10k
	replace income98_bins_`var'=12500 if `var'>=10000 & `var'<15000 //10-15k
	replace income98_bins_`var'=17500 if `var'>=15000 & `var'<20000 //15-20
	replace income98_bins_`var'=22500 if `var'>=20000 & `var'<25000 //20-25
	replace income98_bins_`var'=30000 if `var'>=25000 & `var'<35000 //25-35
	replace income98_bins_`var'=42500 if `var'>=35000 & `var'<50000 //35-50
	replace income98_bins_`var'=62500 if `var'>=50000 & `var'<75000 //50-75 
	replace income98_bins_`var'=92500 if `var'>=75000 & `var'<110000 //75-110 
	replace income98_bins_`var'=1.25*110000 if `var'>=110000 & `var'<. //110+ 
	replace income98_bins_`var'=. if year_`var'<1998 | year_`var'>=2006

	gen income06_bins_`var'=.
	replace income06_bins_`var'=0.75*10000 if `var'>=0 & `var'<10000 //<10k
	replace income06_bins_`var'=12500 if `var'>=10000 & `var'<15000 //10-15k
	replace income06_bins_`var'=17500 if `var'>=15000 & `var'<20000 //15-20
	replace income06_bins_`var'=22500 if `var'>=20000 & `var'<25000 //20-25
	replace income06_bins_`var'=30000 if `var'>=25000 & `var'<35000 //25-35
	replace income06_bins_`var'=42500 if `var'>=35000 & `var'<50000 //35-50
	replace income06_bins_`var'=62500 if `var'>=50000 & `var'<75000 //50-75 
	replace income06_bins_`var'=92500 if `var'>=75000 & `var'<110000 //75-110 
	replace income06_bins_`var'=130000 if `var'>=110000 & `var'<150000 //110-150 
	replace income06_bins_`var'=1.25*150000 if `var'>=150000 & `var'<. //150k+
	replace income06_bins_`var'=. if year_`var'<2006 | year_`var'>=2015
	
	gen income16_bins_`var'=.
	replace income16_bins_`var'=0.75*10000 if `var'>=0 & `var'<10000 //<10k
	replace income16_bins_`var'=12500 if `var'>=10000 & `var'<15000 //10-15k
	replace income16_bins_`var'=17500 if `var'>=15000 & `var'<20000 //15-20
	replace income16_bins_`var'=22500 if `var'>=20000 & `var'<25000 //20-25
	replace income16_bins_`var'=30000 if `var'>=25000 & `var'<35000 //25-35
	replace income16_bins_`var'=42500 if `var'>=35000 & `var'<50000 //35-50
	replace income16_bins_`var'=62500 if `var'>=50000 & `var'<75000 //50-75 
	replace income16_bins_`var'=92500 if `var'>=75000 & `var'<110000 //75-110 
	replace income16_bins_`var'=130000 if `var'>=110000 & `var'<150000 //110-150
	replace income16_bins_`var'=160000 if `var'>=150000 & `var'<170000 //150-170 
	replace income16_bins_`var'=1.25*170000 if `var'>=170000 & `var'<. //170k+
	replace income16_bins_`var'=. if year_`var'<2015
	
* Create 1 harmonized, binned measure (based on survey year)
	/* The year windows here mirror the windows used above to blank each
	   scheme, so every year with a populated scheme gets a bucket. The last
	   two windows previously ended at 2013 and covered only 2015, leaving
	   2014 and any year after 2015 unbucketed. The "<." guard keeps a
	   missing year from being treated as a late year. */
	gen `var'_bucket=.
	replace `var'_bucket=income77_bins_`var' if year_`var'<1982
	replace `var'_bucket=income82_bins_`var' if year_`var'>=1982 & year_`var'<=1985
	replace `var'_bucket=income86_bins_`var' if year_`var'>=1986 & year_`var'<=1990 
	replace `var'_bucket=income91_bins_`var' if year_`var'>=1991 & year_`var'<=1997
	replace `var'_bucket=income98_bins_`var' if year_`var'>=1998 & year_`var'<=2005
	replace `var'_bucket=income06_bins_`var' if year_`var'>=2006 & year_`var'<2015
	replace `var'_bucket=income16_bins_`var' if year_`var'>=2015 & year_`var'<.
	label var `var'_bucket "Income (`var') of child, in GSS buckets"

}

* Give the income measures their final son_ names
rename totfaminc_og son_totfaminc 
rename totfaminc_og_bucket son_totfaminc_bin
rename totfaminc_age40_bucket son_totfaminc_age40_bin
rename totfaminc_age40 son_totfaminc_age40

* The scheme-specific bins and the cleaned working income are no longer needed
drop income77* income82* income86* income91* income98* income06* income16*  totfaminc

*------------------------------------------*
* Convert binned income measures to 1950$
*------------------------------------------*

* Current-year family income (actual and binned)
	/* The deflator is looked up for the year before the survey year
	   (presumably because income is reported for the prior calendar
	   year -- confirm). Deflator rows with no matching record are not kept. */
	gen year_CPI = year-1 
	merge m:1 year_CPI using "../CPI/CPI_deflator.dta", keep(master match) nogenerate

	foreach v of varlist son_totfaminc son_totfaminc_bin {
		replace `v' = `v'*deflator
		label var `v' "Total family income, 1950 dollars"
	}

	drop year_CPI CPI deflator
	
* Family income in the tagged year around age 40 (actual and binned)
	//Same lookup, keyed on the year before the tagged survey year
	gen year_CPI = year_totfaminc_age40-1
	merge m:1 year_CPI using "../CPI/CPI_deflator.dta", keep(master match) nogenerate

	foreach v of varlist son_totfaminc_age40 son_totfaminc_age40_bin {
		replace `v' = `v'*deflator
		label var `v' "Total family income around age 40, 1950 dollars"
	}

	drop year_CPI CPI deflator
	

*-----------------------------------------------------------------*
*-----------------------------------------------------------------*

*********************************
* CROSSWALK FATHER OCCUPATION
*********************************

* Restrict to 1997 onwards 
	/* Note: 1997 is the first year in which retrospective 
	         father occupation is asked. */
	drop if year<1997 
	
	//Diagnostic: raw codes reported before the 2002 cutoff
	tab father_occ_ if year<2002
	
/* 1970 Census-based codes -> coarsened ANES occupations.
   Only pre-2002 waves keep a code; 0 and values above 990 are blanked. */
	clonevar census1970 = father_occ_
	replace census1970=. if year>2002 | ((census1970==0 | census1970>990) & year<2002)
	
	merge m:1 census1970 using ../Crosswalks/Crosswalk_1970Census_toANES.dta, keep(master match)
	//Every non-missing code must have found a crosswalk row
	assert census1970==. if _merge==1
	drop _merge
	
/* 2000 Census-based codes -> coarsened ANES occupations.
   Only post-2002 waves keep a code; 0 and values above 990 are blanked. */
	clonevar occ2000 = father_occ_
	replace occ2000=. if year<2002 | ((occ2000==0 | occ2000>990) & year>2002)
	
	merge m:1 occ2000 using ../Crosswalks/Crosswalk_2000Census_toANES.dta, keep(master match)
	assert occ2000==. if _merge==1
	drop _merge
	
* One harmonized occupation: take the 2000-based result for waves after 2002
	replace fatheroccej = fatheroccej_2000 if year>2002
	
	drop fatheroccej_2000 occ2000 census1970
	
/* Count each respondent's non-missing answers and keep those 
   who reported a father occupation at least once */
	bysort son_id: egen number_answers = total(fatheroccej<.)
	tab number_answers
	
	drop if number_answers<1
	label var number_answers "Number of retrospective answers"
	
	rename fatheroccej occ1950ej
	
*-----------------------------------------------------------------*
*-----------------------------------------------------------------*

************************************************
* FIND RETROSPECTIVE, MODAL FATHER OCCUPATION 
************************************************

	sort son_id year
	
	/* Without minmode/maxmode, mode() returns missing when a respondent's
	   answers have more than one mode. */
	bysort son_id: egen mode_occ_son = mode(occ1950ej) 
	label var mode_occ_son "Modal (retrospective) occupation"
	
* Binary: no mode
	gen no_mode =1 if mode_occ_son==.
	label var no_mode "No modal (retrospective) occupation available"
	
* Modal occupation (max): ties are broken by taking the largest code
	bysort son_id: egen mode_occ_son_max = mode(occ1950ej), maxmode
	label var mode_occ_son_max "Modal (retrospective) occupation, forcing (max) choice"
	
* Modal occupation when adult child is between 30 and 50
	gen temp1 = occ1950ej if age_s>=30 & age_s<=50
	
	bysort son_id: egen mode_occ_son30to50_max = mode(temp1), maxmode
	label var mode_occ_son30to50_max "Modal (retrospective) occupation when son is 30-50, max choice"
	
	bysort son_id: egen mode_occ_son30to50_min = mode(temp1), minmode
	label var mode_occ_son30to50_min "Modal (retrospective) occupation when son is 30-50, min choice"
	
* Modal occupation using only survey years before 2002
	gen temp11 = occ1950ej if year<2002
	
	bysort son_id: egen mode_occ_sonpre2002 = mode(temp11)
	label var mode_occ_sonpre2002 "Modal (retrospective) occupation before 2002 survey"

/* By respondent: Count how many times they 
                  recalled modal occupation */
	/* Guard against missing==missing: when a respondent has no mode, a wave
	   with a missing answer would otherwise count as a "modal" response and
	   number_mode would hold the number of missing answers. */
	gen temp2 = (mode_occ_son == occ1950ej) & !missing(mode_occ_son)
	
	bysort son_id: egen number_mode = total(temp2)
	gen share = number_mode / number_answers
	replace share=. if no_mode==1
	sum share, d
	
	drop temp*
	label var number_mode "Number of times responded modal answer"
	label var share "Share of times son said modal occ."
	
* Rename retrospective, non-modal father occupation
	rename occ1950ej father_occ_retrospective
	label var father_occ_retrospective "Retrospective occupation"
	
/* Find all retrospective father occupations 
   listed per adult child respondent.
   Distinct answers are peeled off two at a time (smallest and largest),
   giving occ_1 ... occ_5; the second of a pair is blanked when it
   equals the first. */

	gen occ_temp = father_occ_retrospective 
	
		//Obtain min and max retrospective answer
		bysort son_id: egen occ_1 = min(occ_temp)
		bysort son_id: egen occ_2 = max(occ_temp)
		replace occ_2=. if occ_2==occ_1
		
		//Erase those possibilities and get new min and max
		replace occ_temp=. if (occ_temp==occ_1 | occ_temp==occ_2) 
		
		bysort son_id: egen occ_3 = min(occ_temp)
		bysort son_id: egen occ_4 = max(occ_temp)
		replace occ_4=. if occ_4==occ_3

		//Erase those possibilities and get new min and max
		replace occ_temp=. if (occ_temp==occ_3 | occ_temp==occ_4) 

		bysort son_id: egen occ_5 = min(occ_temp)
		bysort son_id: egen occ_6 = max(occ_temp)
		replace occ_6=. if occ_6==occ_5
		
		//Diagnostics: show any answers left beyond occ_5 before occ_6 is dropped
		replace occ_temp=. if (occ_temp==occ_5 | occ_temp==occ_6) 
		tab occ_temp 
		tab occ_6
		drop occ_6 occ_temp
	
*-----------------------------------------------------------------*
*-----------------------------------------------------------------*

**********************
* SAVE
**********************

	//Flag (do not drop) the first row of each adult child respondent
	sort son_id year
	by son_id: gen firstobs_son = _n==1
	label var firstobs_son "First observation, son"
	
	//Retain only the variables that belong in the saved file
	keep son_id firstobs_son year* number_answers number_mode share no_mode mode* race* age_s south* father_occ_retrospective ///
	indweight *_totfaminc byr_s2 sex *_bin occ_* son_totfaminc* xsection_weight father_id 
		
	compress
	save ./output/PSID_sons_retrospective.dta, replace

	//Save dataset of father ids, one row per father
	preserve
	keep father_id byr_s2
	/* byr_s2 is the child's birth year. Sorting on it within father makes
	   the kept row reproducible (the earliest-born linked child); without a
	   within-group sort, fathers with several children keep an arbitrary
	   one. NOTE(review): confirm earliest-born is what downstream code wants. */
	bysort father_id (byr_s2): keep if _n==1
	
	save ./output/PSID_fathers_retrospective.dta, replace
	restore